sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.5.1 magrittr_1.5 tools_3.5.1 htmltools_0.3.6
## [5] yaml_2.2.0 Rcpp_1.0.0 stringi_1.2.4 rmarkdown_1.11
## [9] knitr_1.20 stringr_1.3.1 digest_0.6.18 evaluate_0.12
# Unpack report parameters (supplied via the R Markdown YAML header)
output.var = params$output.var
log.pred = params$log.pred
eda = params$eda
algo.forward = params$algo.forward
algo.backward = params$algo.backward
algo.stepwise = params$algo.stepwise
algo.LASSO = params$algo.LASSO
algo.LARS = params$algo.LARS
message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 8
## $ output.var : chr "y3"
## $ log.pred : logi TRUE
## $ eda : logi TRUE
## $ algo.forward : logi FALSE
## $ algo.backward: logi FALSE
## $ algo.stepwise: logi FALSE
## $ algo.LASSO : logi FALSE
## $ algo.LARS : logi FALSE
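# For reference, these parameters come from the params: field of this
# report's R Markdown YAML header; a minimal sketch matching the
# str(params) output above (illustrative, not the actual header):
#
# params:
#   output.var: "y3"
#   log.pred: TRUE
#   eda: TRUE
#   algo.forward: FALSE
#   algo.backward: FALSE
#   algo.stepwise: FALSE
#   algo.LASSO: FALSE
#   algo.LARS: FALSE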
# Set up labels
# alt.scale.label.name = alternate-scale variable name:
#   - if predicting on the log scale, alt.scale is the normal scale
#   - if predicting on the normal scale, alt.scale is the log scale
if (log.pred) {
  label.names = paste('log.', output.var, sep = "")
  alt.scale.label.name = output.var
} else {
  label.names = output.var
  alt.scale.label.name = paste('log.', output.var, sep = "")
}
features = read.csv("../../Data/features.csv")
#str(features)
corr.matrix = round(cor(features[sapply(features, is.numeric)]), 2)
# Keep only variables highly correlated with at least one other variable
threshold = 0.6
corr.matrix.tmp = corr.matrix
diag(corr.matrix.tmp) = 0  # ignore self-correlations
high.corr = apply(abs(corr.matrix.tmp) >= threshold, 1, any)
high.corr.matrix = corr.matrix.tmp[high.corr, high.corr]
DT::datatable(corr.matrix)
DT::datatable(high.corr.matrix)
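# A minimal toy sketch of how the high-correlation filter above behaves
# (illustrative only: a and b are strongly related, c is noise):
# toy = cor(data.frame(a = 1:20, b = (1:20)^2, c = rnorm(20)))
# diag(toy) = 0
# keep = apply(abs(toy) >= 0.6, 1, any)
# toy[keep, keep]  # only rows/columns with some |r| >= 0.6 survive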
feature.names = colnames(features)
drops <- c('JobName')
feature.names = feature.names[!(feature.names %in% drops)]
#str(feature.names)
labels = read.csv("../../Data/labels.csv")
#str(labels)
labels = labels[,c("JobName", output.var)]
summary(labels)
## JobName y3
## Job_00001: 1 Min. : 95.91
## Job_00002: 1 1st Qu.:118.21
## Job_00003: 1 Median :123.99
## Job_00004: 1 Mean :125.36
## Job_00005: 1 3rd Qu.:131.06
## Job_00006: 1 Max. :193.73
## (Other) :9994 NA's :2497
data <- merge(features, labels, by = 'JobName')
drops <- c('JobName')
data = data[,(!colnames(data) %in% drops)]
#str(data)
if (log.pred) {
  # Replace the raw label with its log10 transform and drop the raw column
  data[label.names] = log10(data[alt.scale.label.name])
  drops = c(alt.scale.label.name)
  data = data[!(names(data) %in% drops)]
}
#str(data)
data = data[complete.cases(data), ]  # drop rows with any missing values
if (eda == TRUE) {
  # Correlation of every feature with the (possibly log-scaled) label
  corr.to.label = round(cor(dplyr::select(data, -one_of(label.names)),
                            dplyr::select_at(data, label.names)), 4)
  DT::datatable(corr.to.label)
}
if (eda == TRUE) {
  # Variance inflation factors; values near 1 indicate little multicollinearity
  vifDF = usdm::vif(dplyr::select_at(data, feature.names)) %>% dplyr::arrange(dplyr::desc(VIF))
  head(vifDF, 10)
}
## Variables VIF
## 1 stat31 1.065342
## 2 stat202 1.063139
## 3 stat113 1.061198
## 4 x22 1.060427
## 5 stat200 1.060168
## 6 x6 1.058441
## 7 stat14 1.058339
## 8 stat147 1.058274
## 9 stat207 1.058175
## 10 stat215 1.058161
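# All VIFs above are close to 1, so multicollinearity looks negligible.
# If it were a concern, usdm also offers a stepwise VIF filter; a sketch
# (th = 10 is a common rule-of-thumb cutoff):
# vif.filtered = usdm::vifstep(dplyr::select_at(data, feature.names), th = 10)
# vif.filtered@excluded  # variables the filter would drop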
# Histogram panel for pairs() (adapted from the ?pairs examples)
panel.hist <- function(x, ...) {
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(usr[1:2], 0, 1.5))
  h <- hist(x, plot = FALSE)
  breaks <- h$breaks; nB <- length(breaks)
  y <- h$counts; y <- y / max(y)
  rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
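# panel.hist is typically used as the diagonal panel of a scatterplot
# matrix; a sketch for a handful of features (illustrative variable choice):
# pairs(data[, c('x7', 'x9', 'x18', label.names)], diag.panel = panel.hist)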
if (eda == TRUE) {
  hist(data[, label.names])
  #hist(data[complete.cases(data), alt.scale.label.name])
}
# https://stackoverflow.com/questions/24648729/plot-one-numeric-variable-against-n-numeric-variables-in-n-plots
ind.pairs.plot <- function(data, xvars = NULL, yvar) {
  df <- data
  if (is.null(xvars)) {
    xvars = names(data)[names(data) != yvar]
  }
  # one scatter plot of yvar against each x variable
  for (i in seq_along(xvars)) {
    plot(df[, xvars[i]], df[, yvar], xlab = xvars[i], ylab = yvar)
  }
}
if (eda == TRUE){
ind.pairs.plot(data, feature.names, label.names)
}
if (eda == FALSE) {
  # x18 may need a transformation (suggested by the EDA scatter plots)
  plot(data[, 'x18'], data[, label.names], main = "Scatter Plot vs. x18",
       ylab = label.names, xlab = 'x18')
  plot(sqrt(data[, 'x18']), data[, label.names], main = "Scatter Plot vs. sqrt(x18)",
       ylab = label.names, xlab = 'sqrt(x18)')
  # transform x18 and drop the original column
  data$sqrt.x18 = sqrt(data$x18)
  data = dplyr::select(data, -one_of('x18'))
  # open questions: do x7 and x9 need transformations too?
  # x11 looks discrete past a certain point; will this be a problem?
}
data = data[sample(nrow(data)), ]  # randomly shuffle rows
split = caTools::sample.split(data[, label.names], SplitRatio = 0.8)  # 80/20 train/test split
data.train = subset(data, split == TRUE)
data.test = subset(data, split == FALSE)
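# sample() and sample.split() are both stochastic; for a split that is
# reproducible across knits, a seed could be set just before the shuffle
# (value arbitrary):
# set.seed(42)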
plot.diagnostics <- function(model, train) {
  plot(model)  # standard lm diagnostics (residuals vs. fitted, Q-Q, etc.)
  r.standard = rstandard(model)  # standardized residuals
  r.student = rstudent(model)    # studentized residuals
  plot(predict(model, train), r.student,
       ylab = "Studentized Residuals", xlab = "Predicted Values",
       main = "Studentized Residual Plot")
  abline(0, 0)
  plot(predict(model, train), r.standard,
       ylab = "Standardized Residuals", xlab = "Predicted Values",
       main = "Standardized Residual Plot")
  abline(0, 0)
  abline(2, 0)   # +/- 2 reference bands
  abline(-2, 0)
  # Histogram of studentized residuals with a standard normal overlay
  hist(r.student, freq = FALSE, main = "Distribution of Studentized Residuals",
       xlab = "Studentized Residuals", ylab = "Density", ylim = c(0, 0.5))
  # Range of x-values for the normal curve
  xfit <- seq(min(r.student) - 1, max(r.student) + 1, length = 40)
  yfit <- dnorm(xfit)
  lines(xfit, yfit)
}
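# plot(model) inside plot.diagnostics emits several figures one after
# another; interactively they can be arranged on one page, e.g. for a
# fitted model m:
# op = par(mfrow = c(2, 2)); plot(m); par(op)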
n <- names(data.train)
formula <- as.formula(paste(paste(n[n %in% label.names], collapse = " + ")," ~", paste(n[!n %in% label.names], collapse = " + ")))
grand.mean.formula = as.formula(paste(paste(n[n %in% label.names], collapse = " + ")," ~ 1"))
print(formula)
## log.y3 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 +
## x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 +
## x22 + x23 + stat1 + stat2 + stat3 + stat4 + stat5 + stat6 +
## stat7 + stat8 + stat9 + stat10 + stat11 + stat12 + stat13 +
## stat14 + stat15 + stat16 + stat17 + stat18 + stat19 + stat20 +
## stat21 + stat22 + stat23 + stat24 + stat25 + stat26 + stat27 +
## stat28 + stat29 + stat30 + stat31 + stat32 + stat33 + stat34 +
## stat35 + stat36 + stat37 + stat38 + stat39 + stat40 + stat41 +
## stat42 + stat43 + stat44 + stat45 + stat46 + stat47 + stat48 +
## stat49 + stat50 + stat51 + stat52 + stat53 + stat54 + stat55 +
## stat56 + stat57 + stat58 + stat59 + stat60 + stat61 + stat62 +
## stat63 + stat64 + stat65 + stat66 + stat67 + stat68 + stat69 +
## stat70 + stat71 + stat72 + stat73 + stat74 + stat75 + stat76 +
## stat77 + stat78 + stat79 + stat80 + stat81 + stat82 + stat83 +
## stat84 + stat85 + stat86 + stat87 + stat88 + stat89 + stat90 +
## stat91 + stat92 + stat93 + stat94 + stat95 + stat96 + stat97 +
## stat98 + stat99 + stat100 + stat101 + stat102 + stat103 +
## stat104 + stat105 + stat106 + stat107 + stat108 + stat109 +
## stat110 + stat111 + stat112 + stat113 + stat114 + stat115 +
## stat116 + stat117 + stat118 + stat119 + stat120 + stat121 +
## stat122 + stat123 + stat124 + stat125 + stat126 + stat127 +
## stat128 + stat129 + stat130 + stat131 + stat132 + stat133 +
## stat134 + stat135 + stat136 + stat137 + stat138 + stat139 +
## stat140 + stat141 + stat142 + stat143 + stat144 + stat145 +
## stat146 + stat147 + stat148 + stat149 + stat150 + stat151 +
## stat152 + stat153 + stat154 + stat155 + stat156 + stat157 +
## stat158 + stat159 + stat160 + stat161 + stat162 + stat163 +
## stat164 + stat165 + stat166 + stat167 + stat168 + stat169 +
## stat170 + stat171 + stat172 + stat173 + stat174 + stat175 +
## stat176 + stat177 + stat178 + stat179 + stat180 + stat181 +
## stat182 + stat183 + stat184 + stat185 + stat186 + stat187 +
## stat188 + stat189 + stat190 + stat191 + stat192 + stat193 +
## stat194 + stat195 + stat196 + stat197 + stat198 + stat199 +
## stat200 + stat201 + stat202 + stat203 + stat204 + stat205 +
## stat206 + stat207 + stat208 + stat209 + stat210 + stat211 +
## stat212 + stat213 + stat214 + stat215 + stat216 + stat217
print(grand.mean.formula)
## log.y3 ~ 1
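# An equivalent, more compact construction with base R's reformulate()
# (shown for comparison; it should produce the same formulas):
# formula = reformulate(n[!n %in% label.names], response = label.names)
# grand.mean.formula = reformulate("1", response = label.names)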
model.full = lm(formula, data.train)
summary(model.full)
##
## Call:
## lm(formula = formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.084381 -0.021044 -0.004689 0.016465 0.186417
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.991e+00 9.252e-03 215.234 < 2e-16 ***
## x1 -1.778e-04 6.469e-04 -0.275 0.783506
## x2 2.614e-04 4.135e-04 0.632 0.527339
## x3 1.110e-04 1.128e-04 0.984 0.324925
## x4 -3.884e-05 8.872e-06 -4.378 1.22e-05 ***
## x5 4.110e-04 2.911e-04 1.412 0.158109
## x6 2.404e-04 5.871e-04 0.410 0.682156
## x7 1.152e-02 6.300e-04 18.286 < 2e-16 ***
## x8 4.396e-04 1.469e-04 2.992 0.002783 **
## x9 3.101e-03 3.286e-04 9.438 < 2e-16 ***
## x10 1.453e-03 3.052e-04 4.762 1.97e-06 ***
## x11 1.939e+05 7.301e+04 2.655 0.007947 **
## x12 -8.972e-05 1.863e-04 -0.482 0.630030
## x13 1.250e-04 7.421e-05 1.685 0.092098 .
## x14 -5.789e-04 3.192e-04 -1.814 0.069744 .
## x15 -5.426e-06 3.047e-04 -0.018 0.985794
## x16 1.058e-03 2.100e-04 5.041 4.77e-07 ***
## x17 1.489e-03 3.209e-04 4.642 3.53e-06 ***
## x18 5.999e-03 2.241e-04 26.762 < 2e-16 ***
## x19 2.878e-04 1.642e-04 1.753 0.079583 .
## x20 -5.318e-04 1.126e-03 -0.472 0.636727
## x21 1.199e-04 4.180e-05 2.869 0.004138 **
## x22 -3.859e-04 3.404e-04 -1.133 0.257064
## x23 -8.337e-05 3.221e-04 -0.259 0.795785
## stat1 -3.616e-05 2.452e-04 -0.147 0.882788
## stat2 1.547e-04 2.443e-04 0.633 0.526443
## stat3 5.181e-04 2.460e-04 2.106 0.035229 *
## stat4 -5.047e-04 2.461e-04 -2.051 0.040313 *
## stat5 -1.668e-04 2.465e-04 -0.677 0.498537
## stat6 -9.581e-05 2.455e-04 -0.390 0.696317
## stat7 -9.974e-05 2.450e-04 -0.407 0.683933
## stat8 -2.072e-04 2.453e-04 -0.845 0.398275
## stat9 -1.237e-06 2.452e-04 -0.005 0.995977
## stat10 -2.820e-04 2.442e-04 -1.155 0.248218
## stat11 -1.236e-04 2.463e-04 -0.502 0.615809
## stat12 2.051e-04 2.445e-04 0.839 0.401609
## stat13 -2.999e-04 2.442e-04 -1.228 0.219375
## stat14 -8.128e-04 2.435e-04 -3.338 0.000849 ***
## stat15 -2.859e-04 2.421e-04 -1.181 0.237629
## stat16 1.428e-04 2.448e-04 0.584 0.559522
## stat17 5.120e-05 2.425e-04 0.211 0.832766
## stat18 -2.317e-04 2.438e-04 -0.950 0.342064
## stat19 2.470e-04 2.448e-04 1.009 0.313029
## stat20 -1.184e-04 2.441e-04 -0.485 0.627551
## stat21 2.964e-05 2.454e-04 0.121 0.903868
## stat22 -3.191e-04 2.463e-04 -1.295 0.195302
## stat23 6.710e-04 2.435e-04 2.756 0.005867 **
## stat24 -3.367e-04 2.447e-04 -1.376 0.168807
## stat25 -3.900e-04 2.449e-04 -1.592 0.111381
## stat26 -3.175e-04 2.453e-04 -1.295 0.195512
## stat27 1.809e-04 2.446e-04 0.740 0.459504
## stat28 7.814e-06 2.458e-04 0.032 0.974644
## stat29 2.682e-04 2.454e-04 1.093 0.274619
## stat30 1.192e-04 2.473e-04 0.482 0.629857
## stat31 -9.795e-05 2.464e-04 -0.397 0.691025
## stat32 8.302e-05 2.477e-04 0.335 0.737486
## stat33 -2.597e-04 2.443e-04 -1.063 0.287729
## stat34 1.751e-04 2.450e-04 0.715 0.474816
## stat35 -5.028e-04 2.458e-04 -2.045 0.040876 *
## stat36 1.090e-04 2.436e-04 0.447 0.654641
## stat37 -4.658e-04 2.476e-04 -1.881 0.060018 .
## stat38 2.904e-04 2.455e-04 1.183 0.236955
## stat39 -1.397e-04 2.433e-04 -0.574 0.565931
## stat40 -1.952e-04 2.444e-04 -0.799 0.424550
## stat41 -3.806e-04 2.428e-04 -1.567 0.117124
## stat42 -4.331e-04 2.437e-04 -1.777 0.075596 .
## stat43 -2.733e-04 2.454e-04 -1.114 0.265454
## stat44 8.283e-05 2.442e-04 0.339 0.734518
## stat45 -5.279e-04 2.442e-04 -2.162 0.030658 *
## stat46 2.831e-04 2.459e-04 1.151 0.249784
## stat47 1.648e-04 2.466e-04 0.668 0.503896
## stat48 4.257e-04 2.447e-04 1.740 0.081974 .
## stat49 3.364e-05 2.425e-04 0.139 0.889691
## stat50 3.425e-04 2.436e-04 1.406 0.159729
## stat51 3.250e-04 2.453e-04 1.325 0.185254
## stat52 -1.063e-04 2.460e-04 -0.432 0.665749
## stat53 -2.931e-04 2.461e-04 -1.191 0.233810
## stat54 -4.229e-04 2.463e-04 -1.717 0.086010 .
## stat55 2.602e-04 2.442e-04 1.066 0.286641
## stat56 -3.205e-04 2.448e-04 -1.309 0.190476
## stat57 -9.730e-05 2.420e-04 -0.402 0.687712
## stat58 -8.365e-05 2.436e-04 -0.343 0.731293
## stat59 2.193e-04 2.441e-04 0.899 0.368894
## stat60 3.568e-04 2.443e-04 1.460 0.144274
## stat61 -9.795e-05 2.453e-04 -0.399 0.689645
## stat62 -4.678e-05 2.450e-04 -0.191 0.848558
## stat63 3.456e-04 2.446e-04 1.413 0.157699
## stat64 -2.393e-04 2.425e-04 -0.987 0.323730
## stat65 -1.148e-04 2.455e-04 -0.468 0.639990
## stat66 1.280e-04 2.472e-04 0.518 0.604550
## stat67 1.234e-04 2.457e-04 0.502 0.615349
## stat68 -2.778e-04 2.457e-04 -1.131 0.258238
## stat69 -1.276e-04 2.452e-04 -0.520 0.602904
## stat70 3.471e-04 2.440e-04 1.423 0.154907
## stat71 7.451e-05 2.438e-04 0.306 0.759930
## stat72 8.108e-05 2.469e-04 0.328 0.742584
## stat73 4.192e-04 2.471e-04 1.697 0.089784 .
## stat74 1.359e-05 2.456e-04 0.055 0.955870
## stat75 -3.129e-04 2.467e-04 -1.268 0.204672
## stat76 7.657e-05 2.447e-04 0.313 0.754379
## stat77 -2.668e-04 2.465e-04 -1.083 0.279050
## stat78 -1.476e-04 2.467e-04 -0.598 0.549658
## stat79 6.783e-05 2.452e-04 0.277 0.782057
## stat80 1.591e-04 2.471e-04 0.644 0.519714
## stat81 4.252e-04 2.455e-04 1.732 0.083361 .
## stat82 2.242e-04 2.436e-04 0.920 0.357475
## stat83 -1.917e-04 2.450e-04 -0.782 0.434049
## stat84 -1.487e-04 2.442e-04 -0.609 0.542653
## stat85 1.147e-04 2.460e-04 0.466 0.640967
## stat86 -1.590e-05 2.446e-04 -0.065 0.948181
## stat87 -5.559e-04 2.453e-04 -2.266 0.023473 *
## stat88 -1.080e-04 2.428e-04 -0.445 0.656556
## stat89 -2.288e-04 2.447e-04 -0.935 0.349868
## stat90 -2.530e-04 2.450e-04 -1.033 0.301787
## stat91 -2.605e-04 2.442e-04 -1.067 0.286126
## stat92 -3.101e-04 2.453e-04 -1.264 0.206123
## stat93 -1.521e-04 2.469e-04 -0.616 0.538046
## stat94 -3.399e-04 2.457e-04 -1.383 0.166615
## stat95 4.036e-05 2.455e-04 0.164 0.869424
## stat96 -9.747e-05 2.448e-04 -0.398 0.690552
## stat97 2.186e-04 2.430e-04 0.900 0.368284
## stat98 3.444e-03 2.425e-04 14.206 < 2e-16 ***
## stat99 1.924e-04 2.457e-04 0.783 0.433711
## stat100 7.978e-04 2.447e-04 3.261 0.001118 **
## stat101 5.535e-05 2.465e-04 0.225 0.822319
## stat102 -1.265e-04 2.449e-04 -0.516 0.605653
## stat103 -3.548e-04 2.482e-04 -1.430 0.152884
## stat104 -2.746e-04 2.441e-04 -1.125 0.260521
## stat105 9.914e-05 2.435e-04 0.407 0.683936
## stat106 -2.415e-04 2.442e-04 -0.989 0.322719
## stat107 -1.933e-04 2.458e-04 -0.787 0.431552
## stat108 -2.947e-04 2.459e-04 -1.199 0.230766
## stat109 6.519e-06 2.436e-04 0.027 0.978648
## stat110 -3.554e-03 2.432e-04 -14.615 < 2e-16 ***
## stat111 -1.203e-04 2.463e-04 -0.488 0.625270
## stat112 5.211e-06 2.462e-04 0.021 0.983116
## stat113 -1.843e-04 2.457e-04 -0.750 0.453115
## stat114 1.618e-04 2.433e-04 0.665 0.506012
## stat115 -8.573e-05 2.435e-04 -0.352 0.724803
## stat116 3.732e-04 2.459e-04 1.518 0.129032
## stat117 -1.973e-07 2.461e-04 -0.001 0.999361
## stat118 -2.574e-04 2.446e-04 -1.052 0.292629
## stat119 3.944e-06 2.439e-04 0.016 0.987099
## stat120 -1.290e-04 2.430e-04 -0.531 0.595552
## stat121 -3.766e-04 2.454e-04 -1.535 0.124889
## stat122 -5.106e-05 2.431e-04 -0.210 0.833681
## stat123 1.794e-05 2.476e-04 0.072 0.942238
## stat124 -2.360e-04 2.451e-04 -0.963 0.335494
## stat125 5.225e-05 2.463e-04 0.212 0.831986
## stat126 1.696e-04 2.450e-04 0.692 0.488778
## stat127 7.369e-05 2.440e-04 0.302 0.762696
## stat128 -1.044e-06 2.449e-04 -0.004 0.996601
## stat129 9.258e-05 2.441e-04 0.379 0.704508
## stat130 1.269e-04 2.449e-04 0.518 0.604370
## stat131 1.659e-04 2.464e-04 0.673 0.500674
## stat132 -1.127e-04 2.432e-04 -0.463 0.643158
## stat133 1.069e-04 2.453e-04 0.436 0.663047
## stat134 -2.836e-04 2.446e-04 -1.160 0.246267
## stat135 -8.239e-05 2.448e-04 -0.337 0.736432
## stat136 -1.735e-04 2.452e-04 -0.707 0.479297
## stat137 -3.669e-05 2.427e-04 -0.151 0.879881
## stat138 -7.098e-05 2.445e-04 -0.290 0.771603
## stat139 2.190e-05 2.458e-04 0.089 0.929023
## stat140 -2.140e-05 2.430e-04 -0.088 0.929833
## stat141 3.770e-04 2.434e-04 1.549 0.121385
## stat142 -3.961e-05 2.482e-04 -0.160 0.873194
## stat143 2.285e-04 2.450e-04 0.932 0.351159
## stat144 4.989e-04 2.441e-04 2.044 0.041002 *
## stat145 1.275e-04 2.471e-04 0.516 0.605914
## stat146 -5.567e-04 2.469e-04 -2.254 0.024211 *
## stat147 -3.076e-04 2.466e-04 -1.247 0.212285
## stat148 -2.241e-04 2.428e-04 -0.923 0.356072
## stat149 -4.686e-04 2.469e-04 -1.898 0.057735 .
## stat150 4.899e-05 2.456e-04 0.199 0.841920
## stat151 -1.836e-04 2.461e-04 -0.746 0.455786
## stat152 -1.782e-04 2.452e-04 -0.727 0.467433
## stat153 2.387e-04 2.488e-04 0.959 0.337350
## stat154 7.632e-05 2.478e-04 0.308 0.758114
## stat155 -8.946e-05 2.442e-04 -0.366 0.714163
## stat156 1.649e-04 2.462e-04 0.670 0.503146
## stat157 -7.952e-05 2.428e-04 -0.327 0.743304
## stat158 -2.051e-04 2.474e-04 -0.829 0.407018
## stat159 -8.435e-05 2.435e-04 -0.346 0.729081
## stat160 -9.287e-06 2.457e-04 -0.038 0.969846
## stat161 4.286e-04 2.460e-04 1.743 0.081435 .
## stat162 8.260e-05 2.428e-04 0.340 0.733692
## stat163 1.294e-04 2.479e-04 0.522 0.601796
## stat164 3.002e-04 2.457e-04 1.222 0.221927
## stat165 -9.308e-05 2.448e-04 -0.380 0.703747
## stat166 -4.031e-04 2.432e-04 -1.657 0.097529 .
## stat167 -2.805e-04 2.452e-04 -1.144 0.252718
## stat168 -2.205e-04 2.437e-04 -0.905 0.365682
## stat169 4.275e-05 2.449e-04 0.175 0.861424
## stat170 -7.808e-05 2.448e-04 -0.319 0.749716
## stat171 2.141e-04 2.455e-04 0.872 0.383199
## stat172 4.107e-04 2.448e-04 1.677 0.093532 .
## stat173 -1.353e-04 2.453e-04 -0.552 0.581171
## stat174 -1.817e-04 2.456e-04 -0.740 0.459444
## stat175 -3.439e-04 2.455e-04 -1.401 0.161270
## stat176 6.551e-05 2.447e-04 0.268 0.788935
## stat177 3.197e-05 2.454e-04 0.130 0.896358
## stat178 -1.724e-04 2.467e-04 -0.699 0.484713
## stat179 9.777e-05 2.444e-04 0.400 0.689187
## stat180 1.523e-04 2.443e-04 0.623 0.533058
## stat181 2.416e-04 2.453e-04 0.985 0.324708
## stat182 -7.181e-05 2.455e-04 -0.292 0.769924
## stat183 2.171e-04 2.447e-04 0.887 0.375002
## stat184 1.533e-04 2.476e-04 0.619 0.536013
## stat185 -1.363e-04 2.410e-04 -0.565 0.571817
## stat186 -2.592e-04 2.466e-04 -1.051 0.293230
## stat187 -6.975e-04 2.435e-04 -2.864 0.004193 **
## stat188 -1.985e-04 2.438e-04 -0.814 0.415550
## stat189 1.749e-04 2.459e-04 0.711 0.476924
## stat190 2.842e-05 2.447e-04 0.116 0.907539
## stat191 -2.759e-04 2.448e-04 -1.127 0.259675
## stat192 9.705e-05 2.460e-04 0.395 0.693148
## stat193 -1.945e-04 2.476e-04 -0.786 0.432181
## stat194 -1.809e-04 2.436e-04 -0.743 0.457775
## stat195 4.676e-04 2.447e-04 1.911 0.056083 .
## stat196 -3.594e-04 2.484e-04 -1.447 0.147945
## stat197 1.132e-04 2.420e-04 0.468 0.639966
## stat198 -5.725e-04 2.454e-04 -2.333 0.019705 *
## stat199 3.342e-04 2.438e-04 1.371 0.170428
## stat200 -5.132e-04 2.424e-04 -2.117 0.034294 *
## stat201 4.143e-05 2.452e-04 0.169 0.865836
## stat202 -2.010e-04 2.468e-04 -0.814 0.415410
## stat203 -5.567e-05 2.447e-04 -0.228 0.820040
## stat204 -4.042e-04 2.440e-04 -1.657 0.097590 .
## stat205 -2.542e-04 2.436e-04 -1.044 0.296711
## stat206 -2.263e-05 2.469e-04 -0.092 0.926966
## stat207 4.118e-04 2.439e-04 1.689 0.091310 .
## stat208 2.396e-05 2.445e-04 0.098 0.921939
## stat209 1.889e-04 2.431e-04 0.777 0.437132
## stat210 -2.048e-04 2.467e-04 -0.830 0.406379
## stat211 -4.131e-05 2.448e-04 -0.169 0.866021
## stat212 -1.460e-04 2.450e-04 -0.596 0.551172
## stat213 -3.959e-05 2.464e-04 -0.161 0.872353
## stat214 -3.346e-04 2.443e-04 -1.370 0.170853
## stat215 -4.004e-04 2.452e-04 -1.633 0.102544
## stat216 -1.404e-04 2.465e-04 -0.570 0.568930
## stat217 1.454e-04 2.462e-04 0.591 0.554785
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0322 on 5761 degrees of freedom
## Multiple R-squared: 0.2554, Adjusted R-squared: 0.2243
## F-statistic: 8.232 on 240 and 5761 DF, p-value: < 2.2e-16
plot.diagnostics(model.full, data.train)
model.null = lm(grand.mean.formula, data.train)
summary(model.null)
##
## Call:
## lm(formula = grand.mean.formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.115119 -0.024167 -0.003381 0.020592 0.190194
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.096995 0.000472 4443 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03656 on 6001 degrees of freedom
plot.diagnostics(model.null, data.train)
## hat values (leverages) are all = 0.0001666111
## and there are no factor predictors; no plot no. 5
# Reference: http://www.stat.columbia.edu/~martin/W2024/R10.pdf
if (algo.forward == TRUE) {
  t1 = Sys.time()
  model.forward = step(model.null, scope = list(lower = model.null, upper = model.full),
                       direction = "forward")
  print(summary(model.forward))  # summary() is not auto-printed inside a block
  t2 = Sys.time()
  print(paste("Time taken for Forward Selection: ", t2 - t1, sep = ""))
}
if (algo.backward == TRUE) {
  # Takes too much time
  t1 = Sys.time()
  model.backward = step(model.full, direction = "backward")
  print(summary(model.backward))
  t2 = Sys.time()
  print(paste("Time taken for Backward Elimination: ", t2 - t1, sep = ""))
}
if (algo.stepwise == TRUE) {
  t1 = Sys.time()
  model.stepwise = step(model.null, scope = list(upper = model.full), direction = "both")
  print(summary(model.stepwise))
  t2 = Sys.time()
  print(paste("Time taken for Stepwise Selection: ", t2 - t1, sep = ""))
}
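# Note: t2 - t1 above is a difftime whose units depend on the elapsed time,
# and paste() drops the units; pinning them makes timings comparable, e.g.:
# as.numeric(difftime(t2, t1, units = "secs"))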
if (algo.LASSO == TRUE) {
  t1 = Sys.time()
  # glmnet's default alpha = 1 gives the LASSO penalty
  model.LASSO = glmnet::cv.glmnet(as.matrix(data.train[, feature.names]),
                                  data.train[, label.names],
                                  nfolds = 5, standardize = TRUE)
  print(summary(model.LASSO))
  t2 = Sys.time()
  print(paste("Time taken for LASSO: ", t2 - t1, sep = ""))
  plot(model.LASSO)
  best_lambda = model.LASSO$lambda.1se  # largest lambda within 1 SE of the CV minimum
  lasso_coef = model.LASSO$glmnet.fit$beta[, model.LASSO$glmnet.fit$lambda == best_lambda]
  print(lasso_coef)
  print(lasso_coef[abs(lasso_coef) > 0])  # non-zero (selected) coefficients
}
# summary(model.forward)
# summary(model.stepwise)
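# None of the selection branches above score the held-out split; a sketch of
# test-set evaluation for, e.g., the full model (RMSE on the modeling scale,
# plus a back-transform to the original scale when log.pred is TRUE):
# pred = predict(model.full, data.test)
# rmse = sqrt(mean((data.test[, label.names] - pred)^2))
# print(paste("Test RMSE (", label.names, "): ", round(rmse, 5), sep = ""))
# if (log.pred) pred.original = 10^pred  # undo the log10 transform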